

### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below)
file.info(rstudioapi::getActiveDocumentContext()$path)$mtime

### Script purpose: SELECTS columns and rows to keep, conducts basic column name cleaning 
### and conversions (dates, strings) across the whole dataset

### Execution time: ~60  minutes (mostly string-cleaning loop)

### Inputs: 
# 1) /data/raw_[format1]/country_combined.[format1]
# *) /data/raw_[format1]/country_combined_sample10.[format1] (if running for a 10% sample for tests)

### Outputs:
# 1) /data/intermediate/country_03_limpiar_conjunto.[format1]
# *) /data/intermediate_temp/country_03_limpiar_conjunto (temp1).[format1] [only temporary file to avoid re-running full script in case of an error
# not necessary for executing the script]


#
# SET-UP -------------------------------------------------------------------------------------------------------------------------------
#

rm(list=ls())

### Source the '00_global.R' script with required packages and functions
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))


# library(installr)
# updateR()

# Make a copy of the file
file.copy(rstudioapi::getSourceEditorContext()$path,
          gsub('code', 'code/00_ARCHIVE', gsub('\\.R', ' - copy.R', rstudioapi::getSourceEditorContext()$path)),
          overwrite = T, copy.date = T)




# ' ------------------------------------------------------------------------------------------------------------------------------
# READ FULL DATA ----------------------------------------------------------------------------------------------------------------------
#
t0 = Sys.time() # record start time




country = read_flex(file.path('data', paste0('raw_', format1), 'country_combined'), format = format1)



## set as DT if not already done
if(!any(grepl('data.table', class(country)))){setDT(country)}
gc()

print(paste0('Dataset dimensions'))
print(dim(country))




# ' -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
# BASE CLEANING ACROSS DATASET ----------------------------------------------------------------------------------------------------------------------------------
#


### rename columns  -----------------------------------------------------------------------------------------------------------------------------------------------
# NOTE: keep as close to the original, but iron out some inconsistencies
# like some pay columns being called 'renumeracion' while other just 'remu' 

country = country %>% 
  rename(
    tipo_calificacion = any_of('tipo_calificacionp'),
    jornada = any_of('grado_eus'), # that should stand for EUS = Escala Única de Sueldos
    
    # also all 8 pay (values + currency unit)
    unidad_pago_bruto      = any_of('tipo_unidad_monetaria'),
    pago_bruto             = any_of('remuneracionbruta_mensual'),
    unidad_pago_liquido    = any_of('tipo_unidad_monetaria_remuneracion_liquida'),
    pago_liquido           = any_of('remuliquida_mensual'),
    unidad_pago_adicional  = any_of('tipo_unidad_monetaria_remuneracion_adicional'),
    pago_adicional         = any_of('remu_adicional'),
    unidad_pago_incentivos = any_of('tipo_unidad_monetaria_remuneracion_bonos_incentivos'),
    pago_incentivos        = any_of('remu_bonoin'),
    unidad_pago_viatico    = any_of('tipo_unidad_monetaria_viaticos'),
    pago_viatico           = any_of('viaticos'),

    unidad_pago_horas_diurnas   = any_of('tipo_de_unidad_monetaria_horas_diurnas'),
    unidad_pago_horas_nocturnas = any_of('tipo_de_unidad_monetaria_horas_nocturnas'),
    unidad_pago_horas_festivas  = any_of('tipo_de_unidad_monetaria_horas_festivas'),
    pago_horas_diurnas   = any_of('pago_extra_diurnas'),
    pago_horas_nocturnas = any_of('pago_extra_nocturnas'),
    pago_horas_festivas  = any_of('pago_extra_festivas'),
  )


### (*) select a sample of observations with gross < net pay for Christian
# pr(country$pago_bruto - country$pago_liquido < 0) 
# set.seed(123)
# rows1=sample(which(country$pago_bruto - country$pago_liquido < 0,), 15, replace = F)
# View(country[rows1,])
# fwrite(country[rows1,], na = NA, row.names = F,
#        file.path(main_dir, 'Data', 'gross_below_net.csv'))


### remove columns  -------------------------------------------------------------------------------------------------------------------------------
names(country)

country = country %>% 
  select(-any_of(c( 
    'unidad_monetaria_remuneracion_liquida',
    'exchange',
    'report_pdf',
    'desc_otrpago', 'tipo_unidad_monetaria_remuneracion_monto_desvinculacion',
    'monto_desvin',
    'jornada',
    'horasextra', 'horas_extra', 'horas_extra_diurnas', 'horas_extra_nocturnas', 'horas_extra_festivas'
)))


### + immutable row id --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# NOTE: assign value to each row in this very broadly cleaned dataset to allow matching row-by-row in subsequent scripts
country = country[, row_id_org := .I]

country = country %>% relocate('row_id_org', .before = 1) # move to the 1st position to keep the variable clearly there in column names




# ' ------------------------------------------------------------------------------------------------------------------------------------------------------------------
# CURRENCIES -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#

### checks -> what share of obs has non-peso currency?
# pr(country$unidad_pago_bruto) # 0.0013% in other currencies (Dólares, UTM)



# read pre-prepared file with exchange rates for relevant currencies and years
currencies = read.xlsx(file.path(main_dir, 'data', 'clean', 'additional_data', 'country_currency.xlsx')) %>% 
  dplyr::select(1:3) %>% mutate(exchange = as.numeric(exchange)) %>% setDT()


# all unidad columns should only have either one of currencies listed in the 'currencies' OR one of: no aplica NA + [empty string]
unidad_cols = names(country)[grepl('unidad', names(country))]


### clean units columns
gc()
country[, (unidad_cols) := lapply(.SD, function(x) {
  x <- tolower((x)) # lowercase
  x <- fifelse(x %in% unique(currencies$currency), x, 'pesos') # assign 
  x
}), .SDcols = unidad_cols]


# (*) checks -> units of payment
pr_na(country$unidad_pago_bruto)

units_all = unique(tolower(c(
  funique(country$unidad_pago_bruto),
  funique(country$unidad_pago_liquido),
  funique(country$unidad_pago_adicional),
  funique(country$unidad_pago_incentivos),
  funique(country$unidad_pago_viatico),
  funique(country$unidad_pago_horas_diurnas),
  funique(country$unidad_pago_horas_nocturnas),
  funique(country$unidad_pago_horas_festivas)
)))

### all non-missing units in the 'currencies' dataframe
if(!all(units_all[!units_all  %in% c('no aplica', '', NA)] %in% currencies$currency)){
  stop(print("Not all currency units defined. 
       Please adjust 'data/clean/additional_data/country_currency.xlsx' file"))
}

# create function that will take the relevant dataset, currency unit and corresponding pay value and convert it
currency_col = 'unidad_pago_bruto'
pay_col = 'pago_bruto'


country_foreign = country[ rowSums(country[, lapply(.SD, \(x) x != "pesos"), .SDcols = unidad_cols]) > 0 ]
country = country[!row_id_org %in% country_foreign$row_id_org]

tapply(country_foreign$pago_bruto, country_foreign$unidad_pago_bruto, summary)


convert_currency = function(dta, currency_col, pay_col){
  
  names(currencies)[1] = currency_col

  dta <- dta %>%
    left_join(currencies) %>%
    mutate({{pay_col}} := {{pay_col}} * as.numeric(exchange))

  # If new currencies / years found that are not yet covered -> print them
  # miss_exchange = unique(dta[is.na(dta$exchange) & !is.na(dta$anyo), ..currency_col])
  # 
  # if(nrow(miss_exchange) > 0){
  #   print(paste0('No exchange rate for: ', 
  #                unique(dta$tipo_unidad_monetaria[is.na(dta$exchange) & !is.na(dta$anyo)])))
  # }
  
  return(dta)
}


# apply function to all pay columns
country_foreign = convert_currency(country_foreign, 'unidad_pago_bruto', pago_bruto)
country_foreign = convert_currency(country_foreign, 'unidad_pago_liquido', pago_liquido)
country_foreign = convert_currency(country_foreign, 'unidad_pago_adicional', pago_adicional)
country_foreign = convert_currency(country_foreign, 'unidad_pago_incentivos', pago_incentivos)
country_foreign = convert_currency(country_foreign, 'unidad_pago_viatico', pago_viatico)
country_foreign = convert_currency(country_foreign, 'unidad_pago_horas_diurnas', pago_horas_diurnas)
country_foreign = convert_currency(country_foreign, 'unidad_pago_horas_nocturnas', pago_horas_nocturnas)
country_foreign = convert_currency(country_foreign, 'unidad_pago_horas_festivas', pago_horas_festivas)
gc()


beep()

### re-combine rows without unidad cols
country = country %>% select(-unidad_cols)
country_foreign = country_foreign %>% select(-unidad_cols)

# dummy if any payment made in foreign currency (temporary)
country[, unidad_extranjero := F]
country_foreign[, unidad_extranjero := T]
country_foreign[, exchange := NULL]

setdiff(names(country_foreign), names(country))
country = rbindlist(list(country, country_foreign))

### (*) checks -> distribution of pay in pesos vs in foreign currency
## R: median ok, but mean for foreign currency crazy high due to outliers (keep for now as we winsorize anyway and repeat after doing that)
tapply(country$pago_bruto, country$unidad_extranjero, summary)


# ' -----------------------------------------------------------------------------------------------------------------------------------------------------------
# ALL CHARACTER VARIABLES  -------------------------------------------------------------------------------------------------------------------------
#


### mutate all remaining character strings...
for( col in names(country)[sapply(country, is.character)] ){
  
  print(col)
  
  # don't run for those columns to save time
  if(col %in% c('dataset', 'anyo', 'mes', unidad_cols)){next}
  
  unique_vals <- funique(country[[col]])  # Get unique values
  cleaned_vals <- clean_text(unique_vals)  # Clean unique values
  mapping <- data.table(original = unique_vals, cleaned = cleaned_vals)  # Create a mapping
  setnames(mapping, "original", paste0(col, "_old"))  # Rename original column
  setnames(mapping, "cleaned", col)  # Rename cleaned column
  
  # Merge cleaned values back into original dataframe
  # print(fdistinct(country %>% pull(col)))
  # print(pr_isna(country %>% pull(col)))
  
  setnames(country, col, paste0(col, '_old'))
  country <- mapping[country, on = paste0(col, '_old')]
  
  # print(fdistinct(country %>% pull(col)))
  # print(pr_isna(country %>% pull(col)))

  country = country %>% select(-c(matches('_old$')))
  gc()
  
  
}

country = country %>% select(-c(matches('_old$')))
gc()

  
write_flex(x = country, file.path(main_dir, 'data', 'intermediate_temp', "country_03_limpiar_conjunto (temp1)"), format = format1)




# 
# CLEAN DATES ----------------------------------------------------------------------------------------------------------
#


country = read_flex( file.path(main_dir, 'data', 'intermediate_temp', "country_03_limpiar_conjunto (temp1)"), format = format1)

## set as DT if not already done
if(!any(grepl('data.table', class(country)))){setDT(country)}
gc()

### clean anyo and mes  ---------------------------------------------------------------------------------------------------------- 
  
### checks [missingness] -> no
# sf(country$anyo %>% is.na)
# sf(country$mes %>%  is.na)

# checks [within expected range] -> most but not all
# pr_na(country$anyo > 2019 & 2024 < country$anyo)
# pr_na(country$mes < 1 & country$mes > 12)

country[, mes:=tolower(mes)]

country$mes %>% funique

# clean 'mes' (month) column (it's text rather than numeric -> all month names correct in Spanish)
mes_map <- c(
  "enero" = 1, "febrero" = 2, "marzo" = 3,
  "abril" = 4, "mayo" = 5, "junio" = 6,
  "julio" = 7, "agosto" = 8, "septiembre" = 9,
  "octubre" = 10, "noviembre" = 11, "diciembre" = 12
)

# update column in place
country[, mes := mes_map[mes]]


# substitute anything outside of the expected anyo and mes range with NAs
country = country[anyo >= start_year]
country = country[anyo <= end_year]
country = country[mes >= 1]
country = country[mes <= 12]


### remove observations with NA's for anyo/mes? xxx (the code above does that, but we can subset erroneous values to NA too)
pr_isna(country$anyo)

### add year-month column in date format ----------------------------------------------------------------------
# (helps with date-based operations below) 
country[, anyo_mes := ymd(stri_paste(anyo, ifelse(mes < 10, stri_paste("0", mes), mes), "01", sep = "-"))]



# ' ----------------------------------------------------------------------------------------------------------------------------------------------
# CONVERT DATES -----------------------------------------------------------------------------------------------------------------------------------
#


### date format  --------------------------------------------------------------------------------------------------------------------------

# define a function to convert the dates
date_column = '2024/10/14'

convert_to_date <- function(date_column) {
  
  # Try ymd and dmy, allowing for fallback between them
  date1 <- ymd(date_column, quiet = TRUE)
  
  # If ymd fails (produces NAs), try dmy where date1 is NA
  date1[is.na(date1)] <- dmy(date_column[is.na(date1)], quiet = TRUE)
  
  # Handle any remaining NA values by extracting date-like substrings
  remaining_na <- is.na(date1)
  if (any(remaining_na)) {
    # Extract date-like substrings (dd/mm/yyyy)
    extracted_dates <- str_extract(date_column[remaining_na], "\\d{1,2}[/-]\\d{1,2}[/-]\\d{4}")
    date1[remaining_na & !is.na(extracted_dates)] <- dmy(extracted_dates[!is.na(extracted_dates)])
    
    # Extract date-like substrings (yyyy/mm/dd)
    extracted_dates <- str_extract(date_column[remaining_na], "\\d{4}[/-]\\d{1,2}[/-]\\d{1,2}")
    date1[remaining_na & is.na(date1)] <- ymd(extracted_dates[!is.na(extracted_dates)])
  }
  
  as.Date(date1)
  ifelse(is.na(date1) | grepl('a-z', tolower(date1)), return(NA), return(as.Date(date1)))

}

class(country$fecha_ingreso)
country$fecha_ingreso[1:100]

names(country)[grepl('fecha', names(country))]



# apply to date columns
country[, fecha_ingreso := fifelse(str_detect(fecha_ingreso, '\\d{2}'), convert_to_date(fecha_ingreso), NA)]
country[, fecha_termino := fifelse(str_detect(fecha_termino, '\\d{2}'), convert_to_date(fecha_termino), NA)]
country[, fecha_publicacion := fifelse(str_detect(fecha_publicacion, '\\d{2}'), convert_to_date(fecha_publicacion), NA)]


### ? assign NA for impossible year and month (xxx but double-check with Christian)  -----------------------------------------------------------------------------------------------------------------------------------
country[, `:=`(
    fecha_ingreso = fifelse(fecha_ingreso < ymd("1950-01-01") | fecha_ingreso > end_date1, as.Date(NA), fecha_ingreso),
    fecha_termino = fifelse(fecha_termino < ymd("2010-01-01") | fecha_termino > end_date1, as.Date(NA), fecha_termino)
  )]

beep()

hist(country$fecha_ingreso, breaks = 'years')
hist(country$fecha_termino, breaks = 'year')

nrow(country)
nrow(country) == fdistinct(country$row_id_org)



#
# PREPARE FOR NEXT SCRIPTS -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
#


### +id based on names -------------------------------------------------------------------------------------------------------------------------------

# faster than cur_group_id or group_indices (works, but is depreciated) and results in the same number of IDs
country[, id := .GRP, by = .(nombres, paterno, materno)]

### > save for gender identification -----------------------------------------------------------------------------------------
## the materno, paterno, and nombres columns only needed in that file
write_flex(x =  unique(country[, .(id, nombres, paterno, materno)]),
           file.path(main_dir, 'data',  'intermediate_temp', "03_country_limpiar_conjunto (genero)"), format = format1)



### drop spare columns ------------------------------------------------------------------------------------------------------------------------------------------------------------


### checks > adjust columns needed based on the names of cols used in further scripts
# list 'country' cols used in a all higher scripts
cols_all = colnames(country) # list current columns in the full data file


cols_used = c() # create empty vector to store the names of cols used

# loop over all scripts to extract their content and cols used
for(i in list.files(path = file.path(main_dir, 'code'), pattern = "^(0[4-9]|[1-9][0-9])",  full.names = F)){
  
  # get columns used in each script
  script_cols  = cols_all[sapply(cols_all, function(c) any(grepl(c, tolower(readLines(file.path(main_dir, 'code', i))))))]
  
  # add them to the list of columns used in all higher-up scripts
  cols_used = sort(unique(c(cols_used, script_cols)))
  
  # save for each script to store subsets of data needed in each script seperately
  assign(paste0('script_cols_',  gsub('[^0-9]', '',i)), script_cols)
}

cols_used %>% sort
cols_all[!cols_all %in% cols_used] %>% sort

### list names of columns to remove 
cols_select1 = names(country %>%
  select(-c(starts_with('unidad_pago'))) %>% 
  select(-c(starts_with('tipo_de_unidad'))) %>% 
  select(-c(starts_with('horas_extra'))) %>% 
  select(-c(starts_with('horasextra'))) %>% 
  select(-c(starts_with('inflation'))) %>% 
  select(-c(starts_with('exchange'))) %>% 
  select(-any_of(c(
    'tipo_unidad_monetaria_remuneracion_monto_desvinculacion',
    'unidad_monetaria_remuneracion_liquida',
    'fecha_termino', 'fecha_ingreso',
    'fecha_publicacion', 'date_publish',
    'date_terminate', 'date_start',
    'materno', 'paterno', 'nombres'
  ))))

beep()


cols_select1



### divide between needed columns and those not needed at all or not for cleaning - keep 'row_id_org' to merge back as needed
country_in  = country %>% select(c(row_id_org, id, cols_select1))
country_out = country %>% select(c(row_id_org, !cols_select1))


### > save -----------------------------------------------------------------------------------------
gc()

# saving both the main file and the subset with not needed column as it is much faster to merge extra cols if they turn out
# to be required after all using row_id_org than re-run all scripts. 
write_flex(x = country_in, file.path(main_dir, 'data',  'intermediate', "country_03_limpiar_conjunto"), format = format1)
write_flex(x = country_out, file.path(main_dir, 'data',  'intermediate', "country_03_limpiar_conjunto (out)"), format = format1)
beep()


exec_time_fun('exec_time')



# ' ------------------------------------------------------------------------------------------------------------------------------
# FIN DEL CÓDIGO  --------------------------------------------------------------------------------------------
# 